# Loading libraries
library(jsonlite)
library(tidyverse)
library(lubridate)
library(ggthemes)
library(scales)
library(hrbrthemes)
library(DT)
library(stringr)
library(qdap)
library(gridExtra)
library(rvest)
library(xlsx)
library(waffle)
library(cowplot)
library(tidytext)

1 Approval rating

1.1 Scrapen van data

Bron Gallup’s interactive graphic

Goal of script: Fetch Obama’s and Trump’s ‘Presidents approval rating’ data in JSON format, from Gallup’s interactive graphic and convert to tidy csv dataframe.

# Fetch data from Gallup's JSON pages behind their interactive approval rating graphic
trump_json <- fromJSON(txt = "http://news.gallup.com/wwwv7interactives/json/CURRENTPRESWEEKLY/codename.aspx?", 
                       flatten = TRUE)
obama_json <- fromJSON(txt = "http://news.gallup.com/wwwv7interactives/json/OBAMAEXPANDED/codename.aspx?", 
                       flatten = TRUE)
old_presidents_json <- fromJSON(txt = "http://news.gallup.com/wwwv7interactives/json/ALLPRESIDENTS/codename.aspx?", 
                                flatten = TRUE)
# Locate the wanted dataframe and convert to tidy dataframe
trump_df <- as.tibble(trump_json$CurrentPresident$data$date)
obama_df <- as.tibble(obama_json$ExpandedDemographics$data$date)
old_presidents_df <- as.tibble(old_presidents_json$AllPresidents$HistoricalPresident)

1.2 Trump dataset

# Tidy data: each variable its own column
trump_df <- trump_df %>%
  gather(key = "type", value = "rating", 4:47) %>%
  mutate(president = "Trump") %>%
  select(6, everything())
# Convert to correct data classes
trump_df$n <- as.numeric(trump_df$n)
trump_df$rating <- as.numeric(trump_df$rating)
trump_df$endDate <- date(trump_df$endDate)

# Uncomment and running the following line and you'll save a neat csv file in your current directory
# write.csv(trump_df, "trump_approval_rating.csv")
datatable(head(trump_df, n = nrow(trump_df)), options = list(pageLength = 5))
trump_df %>% 
  filter(type %in% c("Party.R", "Party.D", "Overall.A")) %>% # filter on GOP's, Dem's and everyone
  ggplot(aes(x = endDate, y = rating)) +
  geom_line(aes(colour = type),
            size = 1) +
  geom_text(aes(label = rating, colour = type),
            data = filter(trump_df, endDate == "2017-12-10" &
                                    (type == "Party.R" | 
                                     type == "Party.D" |
                                     type == "Overall.A")),
            vjust = -1.5,
            fontface = 2) +
  annotate(geom = "text", x = as.Date("2017-02-01"), hjust = 0, y= 78, label = "Republikeinen", colour = "#E31A1C", fontface = 2) +
  annotate(geom = "text", x = as.Date("2017-02-01"), hjust = 0, y= 47, label = "Totaal", colour = "grey", fontface = 2) +
  annotate(geom = "text", x = as.Date("2017-02-01"), hjust = 0, y= 15, label = "Democraten", colour = "#1F78B4", fontface = 2) +
  scale_x_date(breaks = date_breaks("1 month"),
               labels = date_format("%b")) +
  scale_colour_manual(values = c("grey", "#1F78B4", "#E31A1C"),
                      labels = c("Iedereen", "Democraten", "Republikeinen")) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(title = "Trump's eerste jaar approval ratings",
       y = "Approval rating %") 

1.3 Obama dataset

# Tidy data: each variable its own column
obama_df <- obama_df %>%
  gather(key = "type", value = "rating", 4:47) %>%
  mutate(president = "Obama") %>%
  select(6, everything())
# Convert to correct data classes
obama_df$n <- as.numeric(obama_df$n)
obama_df$rating <- as.numeric(obama_df$rating)
obama_df$endDate <- date(obama_df$endDate)
# Uncomment and running the following line and you'll save a neat csv file in your current directory
# write.csv(obama_df, "trump_approval_rating.csv")
# Pasting the two together
rating_comparison <- rbind(trump_df, obama_df)

datatable(head(rating_comparison, n = nrow(trump_df)), options = list(pageLength = 5))
# Plot
rating_comparison %>% 
  filter(type == "Overall.A") %>%
  group_by(president) %>% 
  mutate(Day = as.numeric(endDate - min(endDate))) %>% 
  ungroup() %>% 
  filter(Day <= 365) %>% 
  ggplot(aes(Day, rating)) + 
    geom_line(aes(color = president),
              size = 1.5) +
  theme_minimal() +
  theme(legend.position = 0) +
  annotate(geom = "text", x = 50, y= 32, label = "Trump", colour = "#E31A1C", fontface = 2) +
  annotate(geom = "text", x = 50, y= 70, label = "Obama", colour = "#1F78B4", fontface = 2) +
  ylim(0,100) +
  scale_colour_manual(values = c("#1F78B4", "#E31A1C")) +
  labs(title = "Approval rating Obama vs Trump eerste jaar",
       x = "Dagen in eerste jaar",
       y = "Rating in %")

# Difference obama en Trump
rating_comparison %>%
  filter(type %in% c("Race.W", "Race.N")) %>%
  ggplot(aes(x = endDate, y = rating, colour = type)) +
  geom_line() +
  scale_colour_manual(values = c("black", "#56B4E9")) +
  theme_minimal() +
  theme(legend.position = 0) +
  annotate(geom = "text", x = as.Date("2010-01-01"), hjust = 0, y= 80, label = "Niet-Blank", colour = "black", fontface = 2) +
  annotate(geom = "text", x = as.Date("2010-01-01"), hjust = 0, y= 48, label = "Blank", colour = "#56B4E9", fontface = 2) +
  labs(title = "Approval rating onder 'niet blanken'\nmaakt een vrije val sinds Trump")

# Three different etnicities during Trump
rating_comparison %>%
  filter(type %in% c("Race.W", "Race.B", "Race.H") &
           endDate > "2016-01-25") %>%
  ggplot(aes(x = endDate, y = rating, colour = type)) +
  geom_line(size = 1.2) +
  scale_colour_colorblind(labels = c("Afro-Amerikaans", "Hispanic", "Blank"),
                          name = "Etniciteit") +
  scale_x_date(date_breaks = "1 years", date_labels = "%Y") +
  theme_minimal() +
  theme(legend.position = 0) +
  annotate(geom = "text", x = as.Date("2016-02-01"), hjust = 0, y= 79, label = "Afro-Amerikaans", colour = "black") +
  annotate(geom = "text", x = as.Date("2016-02-01"), hjust = 0, y= 58, label = "Hispanic", colour = "#E69F00") +
  annotate(geom = "text", x = as.Date("2016-02-01"), hjust = 0, y= 32, label = "Blank", colour = "#56B4E9") +
  annotate(geom = "label", x = as.Date("2017-02-15"), hjust = 0, y= 80, label = "20 januari:\nTrump's inauguratie\nzorgt voor vrije val\nin goedkeuringscijfer\npresident bij niet-blanken", size = 3) +
  labs(title = "Approval rating onder 'niet blanken' maakt\neen vrije val sinds Trump",
       subtitle = "Uitgesplitst op etniciteit.",
       x = "")

# Three different etnicities during Trump
rating_comparison %>%
  filter(type %in% c("Education.HS", "Education.SC", "Education.CG", "Education.PG") &
           endDate > "2016-01-25") %>%
  ggplot(aes(x = endDate, y = rating, colour = type)) +
  geom_line(size = 0.8) +
  scale_colour_colorblind(labels = c("College Graduate",
                                     "High School or less",
                                     "Post Graduate",
                                     "Some College"),
                           name = "Education") +
  scale_x_date(date_breaks = "1 years", date_labels = "%Y") +
  theme_minimal() +
  labs(title = "Approval rating gefilterd op educatie",
       x = "")

2 Economie

  1. Daling Werkeloosheid
  2. Groei openstaande vacatures
  3. Flucturerende banangroei
  4. Stijgende import uit China

2.1 Daling werkeloosheid

# Labour statistis data page
url_werkeloosheid <- "https://data.bls.gov/timeseries/LNS14000000"
# Scrape html data with revest
unemployment_rate <- url_werkeloosheid %>%
  read_html() %>%
  html_nodes(xpath = "/html/body/div[5]/div[4]/div/table[2]") %>%
  html_table()
# Tidy data frame
unemployment_rate <- as.data.frame(unemployment_rate) %>%
  as.tbl() %>%
  gather(key = month, value = rate, 2:13) %>%
  filter(Year %in% c(2010:2017)) %>%
  arrange(desc(Year)) %>%
  mutate(president = ifelse(Year == 2017, "Trump", "Obama"),
         month = match(month, month.abb), # Create date strings that can be converted to date class
         month = paste(Year, month, sep = "-"),
         month = paste(month, "-1", sep = ""),
         month = ymd(month)) %>% # Converts to date clas with lubridate
  select(president, month, rate)
# Write csv
# write_csv(unemployment_rate, "unemployment_rate.csv")
# datatable(head(unemployment_rate, n = nrow(unemployment_rate)), options = list(pageLength = 5))

# Plot
plot_unemployment_rate <- unemployment_rate %>%
  ggplot(aes(x = month, y = rate, by = 1)) +
  geom_line(aes(colour = president),
            size = 1,
            na.rm = T) +
  geom_vline(xintercept = as.Date("2017-01-01"), colour = "#838B8B", linetype = "dashed") +
  scale_colour_manual(values = c("#1F78B4", "#E31A1C")) +
  theme_minimal() +
  annotate(geom = "text", x = as.Date("2014-07-01"), hjust = 0, y = 8.5, label = "Inauguratie Trump", colour = "#838B8B") +
  labs(title = "Onder Obama 2010 begint dalende\ntrend werkeloosheidspercentage",
       y  = "Percentage (%)",
       x = "")

plot_unemployment_rate

2.2 Groei openstaande vacatures

# Bureau of labour statistics url
url_vacatures <- "https://data.bls.gov/timeseries/JTS00000000JOR"
# Scraping table
job_openings <- url_vacatures %>%
  read_html() %>%
  html_nodes(xpath = "/html/body/div[5]/div[4]/div/table[2]") %>%
  html_table(fill = TRUE)

job_openings <- as.tibble(as.data.frame(job_openings)) %>%
  select(-14) %>% # Remove last weird redundant column
  gather(key = month, value = rate, 2:13) %>% # Transpose
  filter(Year %in% c(2009:2017)) %>% 
  arrange(desc(Year)) %>%
  mutate(president = ifelse(Year == 2017, "Trump", "Obama"),
         rate = gsub("\\(P\\)", "", rate),
         rate = as.numeric(rate),
         month = match(month, month.abb), # Create date strings that can be converted to date class
         month = paste(Year, month, sep = "-"),
         month = paste(month, "-1", sep = ""),
         month = ymd(month)) %>%
  select(president, month, rate)

# Plot
plot_job_openings <- job_openings %>%
  ggplot(aes(x = month, y = rate, by = 1)) +
  geom_line(aes(colour = president),
            size = 1,
            na.rm = T) +
  scale_colour_manual(values = c("#1F78B4", "#E31A1C")) +
  theme_minimal() +
  labs(title = "Klimmende trend % openstaande\nvacatures al vanaf begin Obama",
       y  = "Percentage (%)",
       x = "")

plot_job_openings

2.3 Flucturerende banangroei

  • Bron: Bureau of Labour Statistics. 1 Month Net Change | Employment Statistics survey (National) | All employees, thousands, total nonfarm, seasonally adjusted.
# url 
url_banengroei <- "https://data.bls.gov/timeseries/CES0000000001?output_view=net_1mth"
# Scrape data
job_growth <- url_banengroei %>%
  read_html() %>%
  html_nodes(xpath = "/html/body/div[5]/div[4]/div/table[2]") %>%
  html_table(fill = T)
# Convert into usable data.frame and remove last row which contains no data 
job_growth <- as.data.frame(job_growth)
job_growth <- job_growth[-12,]
# Create tidy dataframe
job_growth <- job_growth %>%
  select(-14) %>% # Remove last weird redundant column
  gather(key = month, value = employees, 2:13) %>% # Transpose
  filter(Year %in% c(2013:2017)) %>% 
  arrange(desc(Year)) %>%
  mutate(president = ifelse(Year == 2017, "Trump", "Obama"),
         employees = gsub("\\(P\\)", "", employees),
         employees = as.numeric(employees),
         month = match(month, month.abb), # Create date strings that can be converted to date class
         month = paste(Year, month, sep = "-"),
         month = paste(month, "-1", sep = ""),
         month = ymd(month)) %>%
  select(president, month, employees)

# Write csv
# write_csv(job_growth, "banengroei.csv")
# datatable(head(job_growth, n = nrow(job_growth)), options = list(pageLength = 5))

# Plot
plot_job_growth <- job_growth %>%
  ggplot(aes(x = month, y = employees)) +
  geom_line(aes(colour = president),
            size = 1,
            na.rm = T) +
  scale_colour_manual(values = c("#1F78B4", "#E31A1C")) +
  theme_minimal() +
  labs(title = "Groei in banen fluctueerd al sinds 2011\nrond de 200.000 banen",
       subtitle = "Verandering in aantal niet-boeren medewerkers x1000.", 
       y = "werknemers x1000",
       x = "")

plot_job_growth

2.4 Stijgende import uit China

# URL of census webpage
url_china <- "https://www.census.gov/foreign-trade/balance/c5700.html"
# Scrape all tables from CENSUS webpage
china_tbls_scraped <- url_china %>%
  read_html() %>%
  html_nodes("table") %>%
  html_table()
# Create one dataset from 33 tables 
china_tbl <- as.tibble(do.call(rbind, china_tbls_scraped))

# Trim redundant space in month column
china_tbl$Month <- str_trim(clean(china_tbl$Month))
# Delete Total rows and split Month column
china_tbl <-  china_tbl %>% filter(!grepl("TOTAL", Month)) %>%
  separate(Month, into = c("Month", "Year"), sep = " ")
# Create new Month column with class 'Date' for better computing
china_tbl <- china_tbl %>% 
  mutate(Month = match(Month, month.name),
         Month = paste(Year, Month, sep = "-"),
         Month = paste(Month, "-1", sep = ""),
         Month = ymd(Month)) %>%
  select(-2)
# Clean value columns from seperators and decimals now value is *billion dollars (miljard)
china_tbl$Exports <- gsub(",.*", "", china_tbl$Exports)
china_tbl$Imports <- gsub(",.*", "", china_tbl$Imports)
china_tbl$Balance <- gsub(",.*", "", china_tbl$Balance)
# Convert to numeric values
china_tbl <- china_tbl %>% mutate(Exports = as.double(Exports),
                     Imports = as.double(Imports),
                     Balance = as.double(Balance))
# Cerate tidy dataset 
china_tbl <- china_tbl %>% gather(key = Type, value = bln_dollars, 2:4)

# Plot1
plot_china_2000 <- china_tbl %>%
  filter(Type == "Imports",
         Month > "1999-12-01") %>%
  group_by(Month, "year") %>%
  ggplot() +
  geom_area(aes(x = Month, y = bln_dollars, group = 1),
            size = 1,
            fill = "#E53E22") +
  scale_x_date(date_breaks = "1 years", date_labels = "%y") +
  theme_minimal() +
  theme(plot.caption = element_text(colour = "#838B8B")) +
  labs(title = "Trump's Chinese draak",
       subtitle = "Oktober onder Trump kent de hoogste Chinese importcijfers\nin de geschiedenis van de VS. *Niet seizoensgecorrigeerd.",
       y = "miljard dollar",
       x = "",
       caption = "*Niet seizoensgecorrigeerd.\nBron: United States Census Bureau")

plot_china_2000

# Plot2
plot_china_2017 <- china_tbl %>% 
  filter(Type == "Imports" &
         Month > "2016-12-01") %>%
  ggplot() +
  geom_area(aes(x = Month, y = bln_dollars, group = 1),
            fill = "#E53E22") +
  scale_x_date(date_breaks = "1 month", date_labels = "%b") +
  theme_minimal() +
  theme(plot.caption = element_text(colour = "#838B8B")) +
  labs(title = "Chinese import onder Trump",
       subtitle = "In zijn eerste periode als president slaagt Trump er nog niet in\nde Chinese draak te temmen.",
       y = "miljard dollars",
       x = "",
       caption = "*Niet seizoensgecorrigeerd.\nBron: United States Census Bureau")

plot_china_2017

# Seasonally adjusted data from: https://www.census.gov/foreign-trade/statistics/country/index.html
# Download excel file from census webpage

# download.file("https://www.census.gov/foreign-trade/statistics/country/ctyseasonal.xlsx", destfile = "census_data.xlsx")

# Import file
xlsx_file <- read.xlsx("/Users/Thomas/Work/Volkskrant/Langetermijn_dossiers/1_jaar_Trump/census_data.xlsx", sheetName = "ctyseasonal") %>%
  as.tbl()
# Filter on China and per month
china_season_adj <- xlsx_file %>%
  filter(cty_desc == "China") %>%
  select(1, 3, c(IJAN:IDEC))
# Tidy data and create proper date column
china_season_adj <- china_season_adj %>%
  gather(key = month, value = import_dollars, IJAN:IDEC) %>%
  mutate(import_dollars = import_dollars/1000000000,
         month = gsub("I", "", month),
         month = str_to_title(tolower(month)), # first to lowercase then to proper case or else `month.abb` will not work.
         month = match(month, month.abb),
         month = paste(year, month, sep = "-"),
         month = paste(month, "-1", sep = ""),
         month = ymd(month)) %>%
  rename(country = cty_desc) %>%
  add_column(source = "https://www.census.gov/foreign-trade/statistics/country/index.html") %>%
  select(-1)

# Plot 3
china_season_adj %>%
  ggplot() +
  geom_area(aes(x = month, y = import_dollars, group = 1),
            fill = "#E53E22") +
  theme_minimal() +
  theme(plot.caption = element_text(colour = "#838B8B")) +
  labs(title = "Chinese import onder Trump",
       subtitle = "In zijn eerste periode als president slaagt Trump er nog niet in\nde Chinese draak te temmen.",
       y = "billion dollars",
       x = "",
       caption = "*Seizoensgecorrigeerd.\nBron: United States Census Bureau")

3 Inlossen van campagne beloftes

Data van Washington Post Trump Tracker. Op 05-01-2017 pagina gescraped met ‘guess’ functie in Outwit hub. Dan: export selection.

On Oct. 22, Trump issued what he called his “Contract with the American Voter.” This was a specific plan of action that would guide his administration, starting from the first day, and listed 60 promises. He even signed it with his distinctive signature. During Trump’s term, The Washington Post Fact Checker will track the progress of each pledge – and whether Trump has achieved his stated goal. Sign up for the weekly Fact Checker newsletter here.

# Import scraped data
outwit_df <- read.csv("/Users/Thomas/Work/Volkskrant/Langetermijn_dossiers/1_jaar_Trump/trump_campaign_promise_tracker_Washington_Post.csv")

# Summarise satus and count
summary_count <- outwit_df %>%
  select(c(2, 4, 5)) %>%
  as.tbl() %>%
  count(Status)
# Created vector variable for status order
order <- c("Promise broken","Stuck", "Compromise", "Promise kept", "Launched", "Not yet rated")
# Order Status on `order` values
summary_count <- summary_count[match(order, summary_count$Status),]
# Write csv
# write.csv(summary_count, "trump_campagne_beloftes.csv")

datatable(head(summary_count, n = nrow(summary_count)), options = list(pageLength = 5))

Er is ook een dataset met de details per beloftes.

# Create dataframe for details per belofte
details_per_belofte <- outwit_df %>%
  select(c(5, 2, 4)) %>%
  rename(Theme = Column.3,
         Promise = Column.5)
# Write csv
# write.csv(details_per_belofte, "details_per_belofte.csv")  

3.1 Meta data

meta_data <- tibble(status = c("Promise broken", "Launched", "Stuck", "Compromise", "Not yet rated"), betekenis = c("Trump failed to achieve his goal through inaction, congressional or legal obstacles or a reversal of policy", "Trump has taken action, such as proposing a bill or issuing an order, toward achieving this promise", "Trump has taken action, but Congress or the courts have put up roadblocks", "Trump did not achieve his goal, but accepted a deal that partially achieved his promise", "No action has yet been taken"))

# Write csv
# write.csv(meta_data, "meta_data_campagne_beloftes.csv")

datatable(head(meta_data, n = nrow(meta_data)), options = list(pageLength = 5))

3.2 Status van beloftes

# Add column with total 
summary_count <- add_column(summary_count, total = 60)
# Create colour variable for plots
colours <- c("#B71C1C", "#F57F17", "#2196F3", "#0D47A1", "#4CAF50", "#9E9E9E")
# Simple bar Chart
bar_chart <- summary_count %>%
  ggplot(aes(x = Status, y = n, fill = colours)) +
  geom_col() +
  coord_flip() +
  geom_text(aes(label = n), 
            hjust = 1.5,
            colour = "white",
            fontface = "bold",
            size = 3) +
  scale_x_discrete(limits = rev(order)) +
  scale_fill_identity() +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_blank(),
        legend.position = "null") +
  labs(title = "Status campagne beloftes",
       x = "",
       y = "Totaal van 60 beloftes")

# Bar chart with totals as grey bars
bar_chart_with_total <- summary_count %>%
  ggplot(aes(x = Status, y = n, fill = colours)) +
  geom_col(aes(y = total), fill = "grey90") +
  geom_col() +
  coord_flip() +
  geom_text(aes(label = n), 
            hjust = 1.5,
            colour = "white",
            fontface = "bold",
            size = 3) +
  scale_x_discrete(limits = rev(order)) +
  scale_fill_identity() +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.major.y = element_blank(),
        legend.position = "null") +
  labs(title = "Status campagne beloftes",
       x = "",
       y = "Totaal van 60 beloftes")

# All two plots in one grid
plot_grid(bar_chart, bar_chart_with_total, ncol = 1)

4 Vertrouwen van de wereld in President US

Cijfers van pew research center. Geïmporteerd vanaf google sheets. Originele data in pdf report.

meta <- read.csv("/Users/Thomas/Work/Volkskrant/Langetermijn_dossiers/1_jaar_Trump/wereld_vertrouwen/datasets/worlds-view-of-America - Meta.csv")
confidence_pres <- read.csv("/Users/Thomas/Work/Volkskrant/Langetermijn_dossiers/1_jaar_Trump/wereld_vertrouwen/datasets/worlds-view-of-America-sheet1.csv")
confidence_pres <- read.csv("/Users/Thomas/Work/Volkskrant/Langetermijn_dossiers/1_jaar_Trump/wereld_vertrouwen/datasets/worlds-view-of-America-sheet1.csv")

Grote dataset in de code

# Creating clean tibble dataframe
confidence_pres_tbl <- confidence_pres %>%
  gather(key = "year1", value = "conf", 2:16) %>%
  mutate(year = gsub("X", "", year1),
         conf_perc =  as.numeric(conf),
         country = as.character(X),
         year = as.numeric(year)) %>%
  select(c(6, 4, 5)) %>%
  arrange(desc(year)) %>%
  as.tbl()

4.1 Dataset gefilterd

Gefilterd op eerste jaar Obama (2009) en Trump (2017).

# Data frame containg first year Obama and first year Trump
first_year <- confidence_pres_tbl %>%
  filter(year == 2009 |
         year == 2017)
 # Filter out countries in 2009 and 2017 that have NA values in 2009.
first_year <- first_year %>% 
  group_by(country) %>% 
  filter(!any(is.na(conf_perc)))

first_year <- first_year %>%
  mutate(president = ifelse(year == 2017, "Trump", "Obama")) %>%
  select(4, everything())

first_year <- first_year %>%
  select(-year) %>%
  spread(key = president, value = conf_perc)
  
# write.csv(first_year, "/Users/Thomas/Work/Volkskrant/Langetermijn_dossiers/1_jaar_Trump/wereld_vertrouwen/datasets/first_year_obama_trump.csv")

datatable(head(first_year, n = nrow(first_year)), options = list(pageLength = 5))

4.2 Grafiek

first_year %>%
  ggplot() +
  geom_segment(aes(x = Trump,
                   y = reorder(country, -Trump),
                   xend = Obama,
                   yend = country),
               colour = "#AFACAC",
               size = 2) +
  geom_point(aes(x = Trump, 
                 y = reorder(country, -Trump)),
             colour = "#B42030",
             size = 3) +
  geom_point(aes(x = Obama, 
                 y = country),
             colour = "#3C3970",
             size = 3) +
  theme_minimal() +
  scale_colour_manual(values = c("blue", "red")) +
  labs(title = "Trump geniet substantieel minder\nvertrouwen dan Obama",
       subtitle = "Alleen in Rusland is Trump populairder.\nIn Israel is er geen verschil",
       x = "Vertrouwen in president %",
       y = "")

5 Trump’s tweetgewoonten

Een eerste jaar Donald J. Trump in het witte huis. Misschien wel mede mogelijk gemaakt door zijn karakteristieke uitlatingen op Twitter. In de aanloop van de verkiezingen zijn de tweets van de huidige president van de V.S. een krachtig middel gebleken. En nog steeds bereikt hij zijn miljoenen kiezers zonder de traditionele media nodig te hebben. Trump zelf verschijnt dan ook relatief weinig voor de camera, in kranten of andere news sites.

Om beter vat te krijgen op hoe Trump zijn favoriete medium inzet nemen we zijn tweets onder de loep. Doormiddel van een tekstanalyse hopen we een aantal vragen te beantwoorden.

Vragen zoals:

5.1 Data verkrijgen

De geïmporteerde dataset is van Trump Twitter Archive. Dit is een betere bron dan de Twitter API welke niet altijd alle tweets geeft. Bovendien slaat de Trump Twitter Archive ook de meeste verwijderde tweets op.

Als we alle tweets filteren op diegene waarvan wij denken dat ze afkomstig zijn van Trump bestaat de dataset uit bijna 7000 tweets vanaf 05-02-2013, vijf en half jaar geleden.

Trump Twitter Archive word elk uur geupdate. Als wij de code in onder de R-script op deze pagina ‘runnen’ dan worden alle data tabellen en grafieken als vanzelf geupdate. De twitter dataset bevat de volgende zeven variabelen:

  • text: tekst van de tweets.
  • created_at: datum en tijd van tweet in “GMT”
  • source: op welk apparaat of met welke software de tweet is gepost
  • retweet_count: aantal ‘retweets’
  • favourite_count: aantal ‘vind-ik-leuks’
  • is_retweet: of de tweet een retweet is of niet
  • id_str: uniek karakter van tweet
# Loading the used libraries
library(tidyverse)
library(lubridate)
library(tidytext)
library(DT)
library(scales)
library(hrbrthemes)
library(cowplot)
library(rvest)
library(ggthemes)

# url Trump Twitter Archive
url <- 'http://www.trumptwitterarchive.com/data/realdonaldtrump/%s.json'
# Retrieve all trump's tweets and create dataset with converted `created_at` character dates 
original_df <- map(2009:2017, ~sprintf(url, .x)) %>%
  map_df(jsonlite::fromJSON, simplifyDataFrame = TRUE) %>%
  mutate(created_at = parse_date_time(created_at, "a b! d! H!:M!:S! z!* Y!")) %>%
  tbl_df()
# If above doesn't work download data on website then: 
# original_df <- read.csv("filename.csv", quote = "", comment = "")

5.2 Klaarmaken voor analyse: Wanneer zijn de tweets afkomstig van Trump zelf?

Het uiteindelijke doel is om een dataset te maken met unieke Trump tweets. Omdat data scientist David Robinson in zijn analyse, Text analysis of Trump’s tweets confirms he writes only the (angrier) Android half, er vrijwel zeker van is dat Trump destijds een Android toestel gebruikte passen anderen machine learning toe.

Robinson concludeert een jaar later in een follow-up: Trump’s Android and iPhone tweets, one year later dat Trump vrijwel altijd tweet:

  • zonder links, hashtags of afbeeldingen
  • afkomstig van een Android toestel, tot hij een iPhone kocht, hoogst waarschijnlijk rond 25-03-2017.

Wij zullen dat in deze analyse overnemen door de computer op tweets te laten filteren waarvan de text hashtags (“#”) en/of links (“http”) bevatten. Afbeeldingen worden tegenwoordig niet meer in de text van de tweets meegenomen. Ook nemen we de tweets die vanaf andere toestellen afkomstig zijn niet mee.

Het spreekt voor zich dat we alleen originele tweets meenemen dus geen retweets. Bovendien zullen gelijk al filteren op de tweets die vanaf zijn inauguratie zijn verstuurd (20-101-2017).

# Subset data on high probability that Trump himself is actually tweeting and add
all_trump_tweets <- original_df %>%
  rename(retweets = retweet_count,
         favorites = favorite_count) %>%
  filter(is_retweet != "true",
         source == "Twitter for iPhone" |
         source == "Twitter for Android",
         !grepl("http|#|RT|@realdonaldtrump|@realDonaldTrump", text)) %>%
  rowid_to_column("ID") %>%
  select(ID, text, created_at, retweets, favorites, source)
# Filter out all iPhone tweets before 25-03-2017
all_trump_tweets <- all_trump_tweets %>%
  filter((created_at < "2017-03-15" & source != "Twitter for iPhone") | created_at >= "2017-03-15") %>%
  arrange(desc(created_at))
# Get rid of emoji characters R doesn't like them
all_trump_tweets$text <- gsub("[^\x01-\x7F]", "", all_trump_tweets$text)
# Remove redundant "amp"
all_trump_tweets$text <- gsub("amp", "", all_trump_tweets$text)
# Remove numbers
all_trump_tweets$text <- gsub("[0-9]+", "", all_trump_tweets$text)
# Convert GMT time to Eastern US time want Trump tweet daarvanuit het meest en originele tijden zijn GMT (Greenwich Time)
all_trump_tweets$created_at <- with_tz(all_trump_tweets$created_at, tzone = "US/Eastern")
# Create dataset since inauguration
president_tweets <- all_trump_tweets %>%
  filter(created_at > "2017-01-20")
# Create dataset since inauguration
candidate_tweets <- all_trump_tweets %>%
  filter(created_at > "2015-06-16")
# Html widget interactive table in rmarkdown report with only the four useful columns
datatable(head(president_tweets[,c(2,5,6,1,4)], n = nrow(president_tweets)), options = list(pageLength = 5))

5.3 Data analyseren

# Top 10 retweets
most_rt <- president_tweets %>%
  arrange(desc(retweets))
# Html widget interactive table in rmarkdown report
datatable(head(most_rt[,c(2, 4, 5, 3)], n = nrow(most_rt)), options = list(pageLength = 5)) 

5.3.1 Text analyse per woord

Welke woorden gebruikt Trump het meest in zijn tweets?

5.3.1.1 Vanaf Inauguratie

# Create new dataset `tweets_fy_words` (tweets first year words)
tweets_words <- president_tweets %>%
  unnest(text) %>% # Unnest gets rid of lists in text column
  unnest_tokens(word, text)
# Remove stop words and numbers, which aren't useful
tweets_words <- tweets_words %>%
  anti_join(stop_words) # Because in text is 'space' coded as "amp&"

# Sort on most used words
most_used_words <- tweets_words %>% 
  count(word, sort = TRUE) %>%
  mutate(times_used = n,
         word = reorder(word, times_used)) %>%
  select(word, times_used)
# Convert type word column to charcter for better handeling
most_used_words$word <- as.character(most_used_words$word)
# Sort on times_used
most_used_words <- most_used_words %>%
  arrange(desc(times_used))
# Plot in bar chart
plot_most_used_words <- most_used_words %>%
  top_n(20) %>%
  ggplot(aes(x = reorder(word, times_used), times_used)) +
    geom_segment(aes(x = reorder(word, times_used), xend = word, y = 0,yend = times_used),
               colour = "#737373") +
  geom_point(colour = "#03A9F4",
             size = 3) +
  coord_flip() +
  theme_minimal() +
  theme(panel.grid.major.y = element_blank(),
        plot.background = element_blank(),
        plot.title = element_text(face = "bold",
                                  size = 18)) +
  labs(title = "De 20 meest gebruikte woorden in Trump's tweets",
       subtitle = "Vanaf zijn inauguratie op 20-01-2017.",
       x = NULL, 
       y = "Aantal keer gebruikt")

# Plot
plot_most_used_words

# Html widget interactive table in rmarkdown report
datatable(head(most_used_words, n = nrow(most_used_words)), options = list(pageLength = 5)) 

5.3.2 Welke woordparen gebruikt Trump het vaakst?

candidate_bigrams <- candidate_tweets %>%
  unnest_tokens(bigram , text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>%
  unite(bigram, word1, word2, sep = " ")

president_bigrams <- candidate_bigrams %>%
  filter(created_at > "2017-01-20") %>%
  group_by(bigram) %>%
  mutate(count = n())

head(president_bigrams %>%
  mutate(count = n()) %>%
  distinct(bigram, count) %>%
    arrange(desc(count)),20) %>%
  ggplot(aes(x = reorder(bigram, count), count)) +
    geom_segment(aes(x = reorder(bigram, count), xend = bigram, y = 0,yend = count),
               colour = "#737373") +
  geom_point(colour = "#03A9F4",
             size = 3) +
  coord_flip() +
  theme_minimal() +
  theme(panel.grid.major.y = element_blank(),
        plot.background = element_blank(),
        plot.title = element_text(face = "bold",
                                  size = 18)) +
  labs(title = "De 20 meest gebruikte woorden in Trump's tweets",
       subtitle = "Vanaf zijn inauguratie op 20-01-2017.",
       x = NULL, 
       y = "Aantal keer gebruikt")

5.3.3 Wanneer op de dag verstuurt Trump zijn tweets?

  • Trump vanaf toen hij president was
  • Trump vanaf dat hij zich officieel kandidaat stelde
  • Trump van vóór zijn kandidaatstelling
# Create dataframe with weekdays
weekday_tweets <- president_tweets %>%
  mutate(hour_of_day = hour(created_at),
         weekday = strftime(created_at, "%a")) %>%
  group_by(weekday, hour_of_day) %>%
  summarize(count = n()) %>%
  mutate(percentage = count / sum(count))
# Order weekdays on Monday first
weekday_tweets$weekday <- factor(weekday_tweets$weekday, levels = c("Mon", 
    "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))
# Plot distributions
weekday_tweets %>%
  ggplot(aes(hour_of_day, count)) +
  geom_col(fill = "#03A9F4") +
  scale_x_continuous(breaks = seq(0,23,4)) +
  theme_minimal() +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(face = "bold",
                                  size = 18)) +
  labs(title = "Een week in tweets van president Trump",
       subtitle = "Tweets per uur op de dag sinds zijn inauguratie.",
       x = "Uur van de dag",
       y = "Aantal tweets") +
  facet_wrap(~weekday) 

# -------------------President

# Create datframe that groups hours a day and the nr of tweets during his presidency and add percentage column
tweet_time_president <- all_trump_tweets %>%
  mutate(hour_of_day = hour(created_at)) %>%
  filter(created_at > "2017-01-20") %>% # Consists of 1.235 tweets
  group_by(hour_of_day) %>%
  summarize(count = n()) %>%
  mutate(percentage = count / sum(count))
# Plot a bar chart of a day 
plot_tweet_time_president <- tweet_time_president %>%
  ggplot(aes(hour_of_day, percentage)) +
  geom_col(fill = "#03A9F4") +
  theme_minimal() +
  scale_y_continuous(labels = percent_format(),
                     limits=c(0,0.2)) +
  scale_x_continuous(breaks = seq(0,23,2)) +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(face = "bold",
                                  size = 18)) +
  labs(title = "President Trump",
       subtitle = "Bestaat uit 1.235 tweets.",
       x = "Uur van de dag",
       y = "% van tweets") 

# -------------------Candidate

# Same calculation than `tweet_time_president` but filterd before inauguration and after announcing official canidatcy
tweet_time_candidate <- all_trump_tweets %>%
  mutate(hour_of_day = hour(created_at)) %>%
  filter(created_at < "2017-01-20" &            
           created_at > "2015-06-16") %>% # Consists of 2.493 tweets
  group_by(hour_of_day) %>%
  summarize(count = n()) %>%
  mutate(percentage = count / sum(count))
# Plotting same barchart
plot_tweet_time_candidate <- tweet_time_candidate %>%
  ggplot(aes(hour_of_day, percentage)) +
  geom_col(fill = "#03A9F4") +
  theme_minimal() +
  scale_y_continuous(labels = percent_format(), 
                     limits=c(0,0.2)) +
  scale_x_continuous(breaks = seq(0,23,2)) +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(face = "bold",
                                  size = 18)) +
  labs(title = "Kandidaat Trump",
       subtitle = "Bestaat uit 2.493 tweets.",
       x = "Uur van de dag",
       y = "% van tweets")  

# --------------------Before Politics

# Same calculation but filterd before announcing official canidatcy
tweet_time_before_politics <- all_trump_tweets %>%
  mutate(hour_of_day = hour(created_at)) %>%
  filter(created_at < "2015-06-16") %>% # Consists of 3.141 tweets
  group_by(hour_of_day) %>%
  summarize(count = n()) %>%
  mutate(percentage = count / sum(count))
# Plotting same barchart
plot_tweet_time_before_politics <- tweet_time_before_politics %>%
  ggplot(aes(hour_of_day, percentage)) +
  geom_col(fill = "#03A9F4") +
  theme_minimal() +
  scale_y_continuous(labels = percent_format(),
                     limits=c(0,0.2)) +
  scale_x_continuous(breaks = seq(0,23,2)) +
  theme(panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        plot.title = element_text(face = "bold",
                                  size = 18)) +
  labs(title = "Trump vóór officiële politieke carriére ",
       subtitle = "Bestaat uit 3.141 tweets.",
       x = "Uur van de dag",
       y = "% van tweets") 

# Place al three plot's in grid
plot_grid(plot_tweet_time_president, plot_tweet_time_candidate, plot_tweet_time_before_politics, ncol = 1, align = 'v')

5.3.4 Wanneer heeft Trump welke woordparen gebruikt

# Bubble time line
bubble_plot_1 <- candidate_bigrams %>%
  filter(created_at > "2017-01-01",
         grepl("fake news|fake media", bigram, ignore.case = TRUE)) %>%
  ggplot(aes(x = created_at, y = 0)) +
  geom_point(aes(size = retweets),
             alpha = 0.1,
             colour = "#B71C1C") +
  scale_size(range = c(0,15)) +
  scale_x_datetime(date_labels = "%b", date_breaks = "1 month") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.background = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_text(hjust = -0.5),
        panel.grid = element_blank(),
        panel.grid.major.x = element_line(colour = "#BDBDBD",
                                          linetype = "dotted")) +
  labs(title = "Wanneer Trump 'Fake News/Media' in zijn tweets gebruikt",
       x = "",
       y = "")

bubble_plot_1

# Barcode with fixed size
barcode_1 <- candidate_bigrams %>%
  filter(created_at > "2017-01-01",
         bigram %in% c("fake news",
                       "tax cuts")) %>%
  ggplot(aes(x = created_at, y = 0, colour = bigram)) +
  geom_point(shape = 124,
             size = 15,
             alpha = 0.5) +
  scale_x_datetime(date_labels = "%b", date_breaks = "1 month") +
  theme_minimal() +
  theme(plot.background = element_blank(),
        axis.text.y = element_blank(),
        axis.text.x = element_text(hjust = -0.5),
        panel.grid = element_blank(),
        panel.grid.major.x = element_line(colour = "#BDBDBD",
                                          linetype = "dotted")) +
  labs(title = "Wanneer Trump 'Fake News' en 'Tax Cuts' in zijn tweets gebruikte",
       x = "",
       y = "")

barcode_1

plot_colours3 <- c("#b7002e", "#005cb7", "#baa400")

# Barcode where size is according number of retweets
barcode_2 <- candidate_bigrams %>%
  filter(created_at > "2017-01-01",
         bigram %in% c("fake news",
                       "north korea",
                       "tax cuts")) %>%
  ggplot(aes(x = created_at, y = 0, colour = bigram, size = retweets)) +
  geom_point(shape = 95,
             alpha = 0.4) +
  scale_size(range = c(1, 35)) +
  scale_x_datetime(date_labels = "%b",
                   date_breaks = "1 month") +
  scale_colour_manual(values = plot_colours3) +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "none",
        plot.background = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(vjust = -2.5),
        panel.grid = element_blank(),
        panel.grid.major.y = element_line(colour = "#BDBDBD",
                                          linetype = "dotted")) +
  labs(title = "Wanneer Trump zijn meest gebruikte\nwoordparen in zijn tweets gebruikt",
       subtitle = "Elk streepje is een tweet. De groter het streepje\ndes te meer retweets.",
       x = "",
       y = "")

barcode_2

plot_colours4 <- c("#ba001f", "#001fb7", "#b79900", "#00b799")

# Barcode where size is according to number of retweets and multiple words on their own line
barcode_3 <- candidate_bigrams %>%
  filter(created_at > "2017-01-01",
         bigram %in% c("fake news",
                       "north korea",
                       "tax cuts",
                       "failing nytimes")) %>%
  ggplot(aes(x = created_at, y = bigram, colour = bigram, size = retweets)) +
  geom_point(shape = 124,
             alpha = 0.6) +
  scale_size(range = c(1, 10)) +
  scale_x_datetime(date_labels = "%b", date_breaks = "1 month") +
  scale_colour_manual(values = plot_colours4) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.background = element_blank(),
        axis.text.x = element_text(hjust = -0.5),
        panel.grid = element_blank(),
        panel.grid.major.x = element_line(colour = "#BDBDBD",
                                          linetype = "dotted")) +
  labs(title = "Wanneer Trump zijn meest gebruikte woordparen in zijn tweets gebruikt",
       subtitle = "Elk streepje is een tweet. De groter het streepje des te meer retweets.",
       x = "",
       y = "")

barcode_3

plot_colours4 <- c("#001fb7",  "#ba001f", "#b79900", "#00b799")

# Bubble plot where size is according to number of retweets and multiple words on their own line
candidate_bigrams %>%
  mutate(bigram = as.factor(bigram)) %>%
  filter(created_at > "2017-01-01",
         bigram %in% c("fake news",
                       "north korea",
                       "tax cuts",
                       "failing nytimes")) %>%
  ggplot(aes(x = created_at, y = bigram, fill = bigram, size = retweets)) +
  geom_point(shape = 21,
             alpha = 0.3,
             stroke = F) +
  scale_size(range = c(1, 12)) +
  scale_x_datetime(date_labels = "%b", date_breaks = "1 month") +
  scale_fill_manual(values = plot_colours4) +
  theme_minimal() +
  theme(legend.position = "none",
        plot.background = element_blank(),
        axis.text.x = element_text(hjust = -0.5),
        panel.grid = element_blank(),
        panel.grid.major.x = element_line(colour = "#BDBDBD",
                                          linetype = "dotted")) +
  labs(title = "Wanneer Trump zijn meest gebruikte woordparen in zijn tweets gebruikt",
       subtitle = "Elke bubbel is een tweet. De groter de bubbel des te meer retweets.",
       x = "",
       y = "")

6 Een fanatieke golfer

6.1 Scraped table

url <- "http://trumpgolfcount.com/displayoutings"
# Scrape html table
golf_df <- url %>%
  read_html() %>%
  html_nodes(css = "#table_id") %>%
  html_table(fill = T)
# Covert to tibble
golf_df <- as.tibble(as.data.frame(golf_df))
# Convert Date and numeric column  
golf_df$Date <- mdy(golf_df$Date)
# Html table widget
datatable(head(golf_df, n = nrow(golf_df)), options = list(pageLength = 5))